# ggplot
library(ggplot2)

# Load the housing data set from disk and preview its first five rows.
housing <- read.csv("./dataSets/landdata-states.csv")
head(housing, 5)
## State region Date Home.Value Structure.Cost Land.Value
## 1 AK West 2010.25 224952 160599 64352
## 2 AK West 2010.50 225511 160252 65259
## 3 AK West 2009.75 225820 163791 62029
## 4 AK West 2010.00 224994 161787 63207
## 5 AK West 2008.00 234590 155400 79190
## Land.Share..Pct. Home.Price.Index Land.Price.Index Year Qrtr
## 1 28.6 1.481 1.552 2010 1
## 2 28.9 1.484 1.576 2010 2
## 3 27.5 1.486 1.494 2009 3
## 4 28.1 1.481 1.524 2009 4
## 5 33.8 1.544 1.885 2007 4
# ggplot vs. base plots
# With base graphics
hist(housing$Home.Value)
# With ggplot
ggplot(housing, aes(x = Home.Value)) +
  geom_histogram()
# Base graphics: draw MA first, overlay TX in red, then build the legend by hand.
plot(
  Home.Value ~ Date,
  data = subset(housing, State == "MA")
)
points(
  Home.Value ~ Date,
  data = subset(housing, State == "TX"),
  col = "red"
)
legend(
  x = 1975, y = 400000,
  legend = c("MA", "TX"),
  title = "State",
  col = c("black", "red"),
  pch = c(1, 1)
)

# ggplot: mapping State to color yields both series and the legend in one call.
ggplot(
  data = subset(housing, State %in% c("MA", "TX")),
  mapping = aes(x = Date, y = Home.Value, color = State)
) +
  geom_point()
# A. Aesthetic Mapping: In ggplot, an "aesthetic" means "something you can see". For example:
#   + position (e.g. on the x and y axis)
#   + color (outside color)
#   + fill (inside color)
#   + shape
#   + linetype
#   + size
# B. Geometric Objects: These are the actual marks that we put onto a plot. For example:
#   + points
#   + lines
#   + boxplot
#   + ...
# A plot must have at least one geom object; there is no upper limit.
# Search the ggplot2 help pages for topics matching "geom_".
help.search("geom_", package = "ggplot2")

# Keep only the rows whose Date equals 2001.25.
house2001Q1 <- subset(housing, Date == 2001.25)

# Scatter plot: x and y are mapped inside aes(); geom_point() draws the marks.
ggplot(
  data = house2001Q1,
  mapping = aes(x = Land.Value, y = Structure.Cost)
) +
  geom_point()
# Store the fitted values of a linear model of Structure.Cost on log(Land.Value).
house2001Q1$pred_sc <- predict(
  lm(Structure.Cost ~ log(Land.Value), data = house2001Q1)
)

# Points colored by Home.Value, with the fitted values overlaid as a line.
ggplot(
  data = house2001Q1,
  mapping = aes(x = log(Land.Value), y = Structure.Cost)
) +
  geom_point(mapping = aes(color = Home.Value)) +
  geom_line(mapping = aes(y = pred_sc))

# Same plot plus a smoother. Note that size is a constant here, so it is
# passed outside aes() instead of being mapped to a column.
ggplot(
  data = house2001Q1,
  mapping = aes(x = log(Land.Value), y = Structure.Cost)
) +
  geom_point(mapping = aes(color = Home.Value), size = 1) +
  geom_line(mapping = aes(y = pred_sc)) +
  geom_smooth()
# Map further columns to aesthetics (region -> shape) and label each point
# with its State. size, hjust and vjust are constants, so they are given
# outside aes().
ggplot(
  data = house2001Q1,
  mapping = aes(x = log(Land.Value), y = Structure.Cost)
) +
  geom_point(mapping = aes(color = Home.Value, shape = region)) +
  geom_text(
    mapping = aes(label = State),
    size = 2,
    hjust = 0.5,
    vjust = -0.9
  )
# Economist data: Human Development Index (HDI) and
# Corruption Perception Index (CPI) per country.
dat <- read.csv("./dataSets/EconomistData.csv")
head(dat, 3)
## X Country HDI.Rank HDI CPI Region
## 1 1 Afghanistan 172 0.398 1.5 Asia Pacific
## 2 2 Albania 70 0.739 3.1 East EU Cemt Asia
## 3 3 Algeria 96 0.698 2.9 MENA

# HDI against CPI; color follows Region and point size follows HDI.Rank.
ggplot(data = dat, mapping = aes(x = CPI, y = HDI)) +
  # Alternatives to try instead of the layer below:
  #   geom_point(color = "blue") +
  #   geom_point(aes(color = Region), size = 2)
  geom_point(mapping = aes(color = Region, size = HDI.Rank))
# stat functions can be passed through geom_ functions. But in order to change
# one, you have to first determine which stat the geom_ uses, then determine
# the arguments to that stat.
ggplot(housing, aes(Home.Value)) +
  geom_histogram()
# To change the width of each bar in the histogram, we first need to name the stat (stat_bin) and then pass its binwidth argument:
# Histogram of Home.Value with the bin stat named explicitly and 4000-wide bins.
ggplot(data = housing, mapping = aes(x = Home.Value)) +
  geom_histogram(
    stat = "bin",
    binwidth = 4000
  )
# In geom_bar, the default stat is "count"; to plot values that have already been computed, set stat = "identity" instead.
# Mean Home.Value for every State/region combination.
housing_agg <- aggregate(
  Home.Value ~ State + region,
  data = housing,
  FUN = mean
)

# Bar heights come straight from the aggregated values, so the stat must be
# given as "identity".
ggplot(data = housing_agg, mapping = aes(x = State, y = Home.Value)) +
  geom_bar(mapping = aes(color = region), stat = "identity")
# HDI against CPI with a loess smoother (span 0.3).
# Other smoothing methods: "lm", "glm", "gam", "loess", "rlm".
ggplot(data = dat, mapping = aes(x = CPI, y = HDI)) +
  geom_point() +
  geom_smooth(
    method = "loess",
    span = 0.3
  )
# Alternative layer: geom_line(stat = "smooth", method = "loess")
# scale (scale_<aesthetic>_<type>).
# scale in ggplot2 includes:
ggplot(housing, aes(x = State, y = Home.Price.Index)) +
  theme(legend.position = "top", axis.text = element_text(size = 6)) +
  geom_point(
    aes(color = Date),
    alpha = 0.5,
    size = 1,
    # Jitter adds random horizontal noise so overlapping points are easier to read.
    position = position_jitter(width = 0.15, height = 0)
  ) +
  scale_x_discrete("State Abbreviation") +
  scale_color_continuous(
    name = "Date",
    breaks = c(1976, 1994, 2010),
    labels = c("'76", "'94", "'10"),
    low = "blue",
    high = "red"
  )
# Use scale_color_gradient2 to interpolate between three different colors.
ggplot(housing, aes(x = State, y = Home.Price.Index)) +
  theme(legend.position = "top", axis.text = element_text(size = 5)) +
  geom_point(
    aes(color = Date),
    alpha = 0.5,
    size = 1,
    position = position_jitter(width = 0.1, height = 0)
  ) +
  scale_color_gradient2(
    name = "",
    breaks = c(1976, 1994, 2013),
    labels = c("'76", "'94", "'13"),
    low = "blue",
    high = "red",
    mid = "grey60",
    midpoint = 1994
  )
# Scale families: scale_color_, scale_fill_, scale_size_, scale_shape_,
# scale_linetype_, scale_x_, scale_y_
ggplot(dat, aes(x = CPI, y = HDI)) +
  theme(text = element_text(size = 9)) +
  geom_point(aes(color = Region)) +
  scale_x_continuous(name = "Corruption Perception Index") +
  scale_y_continuous(name = "Human Development Index") +
  # One hand-picked color per Region value.
  scale_color_manual(
    name = "Region of World",
    values = c(
      "#24576D",
      "#099DD7",
      "#28AADC",
      "#248E84",
      "#F2583F",
      "#96503F"
    )
  )
# No faceting: one colored line per State on a single panel, so the lines
# crowd together and are hard to tell apart.
ggplot(data = housing, mapping = aes(x = Date, y = Home.Value)) +
  theme(text = element_text(size = 8)) +
  geom_line(mapping = aes(color = State))

# Faceting: each State gets its own panel, laid out ten panels per row.
ggplot(data = housing, mapping = aes(x = Date, y = Home.Value)) +
  theme(text = element_text(size = 8)) +
  geom_line() +
  facet_wrap(~State, ncol = 10)
library(tidyr)

# Average Home.Value and Land.Value across all rows sharing the same Date.
housing.byyear <- aggregate(cbind(Home.Value, Land.Value) ~ Date, data = housing, mean)

# Reshape to long form: one row per Date and measure, with the measure name in
# `type` and its number in `value`. pivot_longer() replaces the superseded
# gather().
home.land.byyear <- pivot_longer(
  housing.byyear,
  cols = c(Home.Value, Land.Value),
  names_to = "type",
  values_to = "value"
)

# One line per measure, distinguished by color.
ggplot(home.land.byyear, aes(x = Date, y = value, color = type)) +
  geom_line()
library(ggthemes)
library(ggrepel)

dat <- read.csv("./dataSets/EconomistData.csv")

# Countries whose points get a text label in the plot. Names are matched
# against dat$Country with %in%, so a misspelled entry is silently never
# labelled ("Spane" corrected to "Spain").
pointsToLabel <- c(
  "Russia", "Venezuela", "Iraq", "Myanmar", "Sudan",
  "Afghanistan", "Congo", "Greece", "Argentina", "Brazil",
  "India", "Italy", "China", "South Africa", "Spain",
  "Botswana", "Cape Verde", "Bhutan", "Rwanda", "France",
  "United States", "Germany", "Britain", "Barbados", "Norway", "Japan",
  "New Zealand", "Singapore"
)
# Recode Region as a factor with a fixed level order and display labels.
# The "\n" inside a label puts a line break in the legend text.
dat$Region <- local({
  region_codes <- c(
    "EU W. Europe",
    "Americas",
    "Asia Pacific",
    "East EU Cemt Asia",
    "MENA",
    "SSA"
  )
  region_names <- c(
    "OECD",
    "Americas",
    "Asia &\nOceania",
    "Central &\nEastern Europe",
    "Middle East &\nnorth Africa",
    "Sub-Saharan\nAfrica"
  )
  factor(dat$Region, levels = region_codes, labels = region_names)
})
# First replica of the chart: CPI on x, HDI on y.
ggplot(dat, aes(x = CPI, y = HDI)) +
  theme(legend.position = "top", text = element_text(size = 8)) +
  # Trend line from lm(y ~ x + log(x)); the constant linetype mapping gives it
  # a legend entry.
  geom_smooth(
    mapping = aes(linetype = "R2"),
    method = "lm", formula = y ~ x + log(x), se = FALSE,
    color = "#DA3C2A", size = 0.7
  ) +
  geom_point(aes(color = Region), shape = 21, size = 2, stroke = 1.5) +
  # Label only the countries listed in pointsToLabel.
  geom_text_repel(
    aes(label = Country),
    color = "grey20",
    data = subset(dat, Country %in% pointsToLabel),
    size = 2.5,
    force = 10
  ) +
  # Axis titles follow the mapping above: x is CPI, y is HDI (the two titles
  # were previously swapped).
  scale_x_continuous(
    name = "Corruption Perceptions Index, 2011 (10=least corrupt)",
    limits = c(0.9, 10.5),
    breaks = 1:10
  ) +
  scale_y_continuous(
    name = "Human Development Index, 2011 (1=best)",
    limits = c(0.2, 1.0),
    breaks = seq(0.1, 1, by = 0.1)
  ) +
  scale_color_manual(
    name = "",
    values = c("#24576D", "#099DD7", "#28AADC", "#248E84", "#F2583F", "#96503F"),
    guide = guide_legend(nrow = 1)
  ) +
  ggtitle("Corruption and Human Development")
# + theme_bw()
# R-squared of the model HDI ~ CPI + log(CPI), formatted as a percentage label.
# summary()$r.squared is a fraction, so scale it by 100 before adding "%"
# (otherwise a value such as 0.52 would be printed as "0.52%").
mR2 <- summary(lm(HDI ~ CPI + log(CPI), data = dat))$r.squared
mR2 <- paste0(format(100 * mR2, digits = 2), "%")
library(ggthemes)
library(ggrepel)
# Final replica of the chart: CPI on x, HDI on y.
ggplot(dat, aes(x = CPI, y = HDI)) +
  # Trend line from lm(y ~ x + log(x)); the constant linetype mapping gives it
  # a legend entry that scale_linetype() relabels below.
  geom_smooth(
    mapping = aes(linetype = "r2"),
    method = "lm", formula = y ~ x + log(x), se = FALSE,
    color = "#DA3C2A", size = 0.8
  ) +
  geom_point(aes(color = Region), shape = 1, stroke = 1.5) +
  # Every country takes part in the repel layout; the logical `labels` column
  # is mapped to alpha so that only the listed countries are visible.
  geom_text_repel(
    mapping = aes(label = Country, alpha = labels),
    data = transform(
      dat,
      labels = Country %in% c(
        "Russia", "Venezuela", "Iraq", "Myanmar", "Sudan",
        "Afghanistan", "Congo", "Greece", "Argentina", "Brazil",
        "India", "Italy", "China", "South Africa", "Spain",
        "Botswana", "Cape Verde", "Bhutan", "Rwanda", "France",
        "United States", "Germany", "Britain", "Barbados", "Norway", "Japan",
        "New Zealand", "Singapore"
      )
    ),
    color = "gray40",
    segment.color = "gray80",
    size = 3
  ) +
  # Alpha range 0..1 hides the FALSE labels entirely; guide = "none" replaces
  # the deprecated guide = FALSE.
  scale_alpha_discrete(range = c(0, 1), guide = "none") +
  scale_x_continuous(
    name = "Corruption Perception Index, 2011 (10=least corrupt)",
    limits = c(1.0, 10.0),
    breaks = 1:10
  ) +
  scale_y_continuous(
    name = "Human Development Index, 2011 (1=best)",
    limits = c(0.2, 1.0),
    breaks = seq(0.2, 1, by = 0.1)
  ) +
  scale_color_manual(
    name = "",
    values = c("#24576D", "#099DD7", "#28AADC", "#248E84", "#F2583F", "#96503F"),
    guide = guide_legend(nrow = 1)
  ) +
  # Relabel the trend-line legend entry with the R^2 value held in mR2.
  scale_linetype(
    name = "",
    breaks = "r2",
    labels = list(bquote(R^2 == .(mR2))),
    guide = guide_legend(override.aes = list(linetype = 1, size = 2, color = "#DA3C2A"))
  ) +
  ggtitle("Corruption and human development") +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid = element_blank(),
    panel.grid.major.y = element_line(color = "gray"),
    # panel.grid.major.x = element_line(color = "gray"),
    axis.line.x = element_line(color = "gray"),
    axis.text = element_text(face = "italic"),
    axis.title.x = element_text(face = "italic", size = 8, color = "gray20"),
    axis.title.y = element_text(face = "italic", size = 8, color = "gray20"),
    legend.position = "top",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.text = element_text(size = 8, color = "gray20"),
    plot.title = element_text(size = 13, face = "bold")
  )